library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(hms)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
##
## hms
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
##
## config
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Global knitr chunk options: echo the code, hide warnings, and size figures.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  fig.width = 6,
  fig.asp = .6,
  out.width = "90%"
)

# Default ggplot2 look: minimal theme with the legend placed at the bottom.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Use the viridis palette for continuous colour and fill scales.
# The value must be spelled exactly "viridis"; ggplot2 rejects unknown
# scale types when a continuous scale is built.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# Use viridis for discrete scales as well by overriding the default
# scale constructors.
scale_colour_discrete <- scale_color_viridis_d
scale_fill_discrete <- scale_fill_viridis_d
# Years whose result files are read with the first column specification.
years_1 <- c(1900:2012, 2014)
# Years whose result files are read with the second column specification.
years_2 <- c(2015:2019)

# Read one results CSV, choosing the column specification from the year that
# appears in the file path.
#
# x: path to a single CSV file (length-one character vector).
#
# Returns the parsed table. When the path contains none of the years in
# `years_1` or `years_2`, a warning is raised and NULL is returned, so the
# file is skipped visibly rather than silently.
importing_data <- function(x) {
  if (grepl(paste(years_1, collapse = "|"), x)) {
    read_csv(x, na = c("NULL", "", "0"), col_types = "cicccciiiicc")
  } else if (grepl(paste(years_2, collapse = "|"), x)) {
    read_csv(
      x,
      na = c("NULL", "", "0"),
      col_types = "cccicccccccccccccccccciiiiccc"
    )
  } else {
    warning("No known year found in file path, skipping: ", x, call. = FALSE)
    NULL
  }
}
# Read every file in data/ with importing_data() and stack the results
# into a single table.
boston_df <-
  tibble(file_name = list.files("data", full.names = TRUE)) %>%
  mutate(data = map(file_name, importing_data)) %>%
  unnest(data) %>%
  mutate(
    # The year is parsed as a number out of the file path.
    year = readr::parse_number(file_name),
    # Fall back to `residence` wherever `city` is missing.
    city = coalesce(city, residence),
    # Replace every non-alphanumeric character in the name with a space.
    display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")
  ) %>%
  # Rows without a display name are dropped.
  filter(!is.na(display_name)) %>%
  select(-c(file_name, residence))
# Analysis copy of the results: year as a factor, official time and pace as
# hms values, and overall place as a number.
winners_df <-
  boston_df %>%
  mutate(
    year = as.factor(year),
    across(c(official_time, pace), as_hms),
    place_overall = as.numeric(place_overall)
  )
Make winners over time plot
# Official time of every first-place finisher by year, drawn as points
# joined by a single path.
winners_df %>%
  filter(overall == 1) %>%
  arrange(year) %>%
  ggplot(aes(x = year, y = official_time, group = 1)) +
  geom_point() +
  geom_path() +
  # Only label every tenth year on the x axis.
  scale_x_discrete(breaks = seq(1910, 2020, by = 10))

# The same first-place finishers, listed from the shortest official time up.
winners_df %>%
  filter(overall == 1) %>%
  arrange(official_time)
## # A tibble: 125 × 30
## display_name age gender pace official_time overall gender_result
## <chr> <int> <chr> <time> <time> <int> <int>
## 1 Jim Knaub NA M 03'08" 01:22:17 1 1
## 2 Franz Nietlispach NA M 03'16" 01:25:59 1 1
## 3 Geoffrey Mutai 29 M 04'41" 02:03:02 1 1
## 4 Martin E Duffy NA M 04'45" 02:04:54 1 1
## 5 Robert Kiprono Cheruiyot 21 M 04'48" 02:05:52 1 1
## 6 Robert Cheruiyot 27 M 04'51" 02:07:14 1 1
## 7 Cosmas Ndeti NA M 04'51" 02:07:15 1 1
## 8 Moses Tanui NA M 04'51" 02:07:34 1 1
## 9 Steve Scannapieco 29 M 04'52" 02:07:46 1 1
## 10 Valerie Park NA M 04'52" 02:07:46 1 1
## # … with 115 more rows, and 23 more variables: division_result <int>,
## # seconds <int>, first_name <chr>, last_name <chr>, place_overall <dbl>,
## # bib <chr>, name <chr>, city <chr>, state <chr>, country_residence <chr>,
## # contry_citizenship <chr>, name_suffix <chr>, 5k <chr>, 10k <chr>,
## # 15k <chr>, 20k <chr>, half <chr>, 25k <chr>, 30k <chr>, 35k <chr>,
## # 40k <chr>, projected_time <chr>, year <fct>
Jim Knaub and Franz Nietlispach are reported as having official times of 1:22:17 and 1:25:59, but 1:59 is the fastest marathon ever run (though it was not recorded officially).
–> errors in the data (~1:20:00 is the fastest here, but 1:59 is the fastest ever run – though not officially recorded – https://www.nytimes.com/2019/10/12/sports/eliud-kipchoge-marathon-record.html)
Plotly
Fixing time variable – plotly can’t use lubridate
# Copy of the results with times stored as POSIXct for use with plotly.
#
# The strings hold a time of day only, so the parser fills in the current
# date. The time zone is pinned to UTC because in a local zone with daylight
# saving, a time that falls in the skipped hour on a transition day would not
# parse correctly. Formatting these values back with "%H:%M:%OS" gives the
# same text either way.
plotly_win_df <-
  boston_df %>%
  mutate(
    official_time = as.POSIXct(
      official_time,
      format = "%H:%M:%OS",
      tz = "UTC"
    ),
    year = as.factor(year),
    pace = as.POSIXct(pace, format = "%H:%M:%OS", tz = "UTC"),
    place_overall = as.numeric(place_overall)
  )
making new plotly with reformatted time
# Interactive line chart of first-place official times by year.
# The y values are the times formatted as text, and the hover box shows the
# runner's name, year, time, and pace.
boston_winners <-
  plotly_win_df %>%
  filter(overall == 1) %>%
  plot_ly(
    x = ~year,
    y = ~format(official_time, "%H:%M:%OS"),
    mode = "lines",
    type = "scatter",
    name = "Boston Winners",
    # Show only the custom text on hover.
    hoverinfo = "text",
    text = ~paste0(
      "Name: ", display_name,
      "\n", "Year: ", year,
      "\n", "Time: ", format(official_time, "%H:%M:%OS"),
      "\n", "Pace: ", format(pace, "%H:%M:%OS")
    )
  ) %>%
  layout(
    title = "Boston Marathon Winners by Year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Official Time")
  )
Marathon records – adding new dataset
# Download the marathon records page.
records_html <-
  read_html("https://www.topendsports.com/sport/athletics/record-marathon.htm")

# Parse every <table> on the page and give each one the same five column
# names. The result is a list with one table per <table> node.
record_marathon <-
  records_html %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) %>%
  lapply(function(tbl) {
    setNames(tbl, c("time", "date", "athlete", "country", "marathon"))
  })
# Turn the scraped records into one data frame with an hms `time` column and
# a numeric `year` column.
marathon <-
  record_marathon %>%
  as.data.frame() %>%
  mutate(time = as_hms(time)) %>%
  # Split the date into its three pieces; only the year is kept.
  separate(date, into = c("month", "day", "year")) %>%
  mutate(year = as.numeric(year)) %>%
  select(-c(month, day))
# Line chart of record time against year, with the athlete's name as the
# trace text.
marathon %>%
  plot_ly(
    type = "scatter",
    mode = "lines",
    text = ~paste("Name: ", athlete)
  ) %>%
  add_trace(x = ~year, y = ~time) %>%
  layout(
    # The y-axis category order is taken from the Boston official times.
    yaxis = list(
      categoryorder = "array",
      categoryarray = winners_df$official_time
    )
  )
library(purrr)
library(lubridate)
Ideas for further plots: age × year (using intervals?); pace × year; Boston winner compared with the overall record holder.